Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ape)
library(msa) ## need XVector, Biostrings. All through biocLite
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, regroup, slice
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
library(seqinr)
##
## Attaching package: 'seqinr'
## The following object is masked from 'package:Biostrings':
##
## translate
## The following objects are masked from 'package:ape':
##
## as.alignment, consensus
## The following objects are masked from 'package:dplyr':
##
## count, query
library(RColorBrewer)
Load data
df = read.table("140117.cdr12.txt", header = T, sep = "\t", stringsAsFactors = F) %>%
filter(species == "HomoSapiens", grepl("TR[AB].+\\*01", gene)) %>%
dplyr::select(gene, seqaa, cdr1aa, cdr2aa) %>%
mutate(chain = ifelse(grepl("TRA", gene), "TRA", "TRB"),
gene = gsub("\\*01", "", gene))
df.prob = read.table("cdr.contact.txt", header = T, sep = "\t")
Set up plotting
df.trb = subset(df, chain == "TRB")
clrs = colorRampPalette(rev(brewer.pal(9, 'Paired')))(length(df.trb$gene))
seq = df.trb$seqaa
names(seq) = df.trb$gene
trbAln = msa(AAStringSet(seq))
## use default substitution matrix
print(trbAln)
## CLUSTAL 2.1
##
## Call:
## msa(AAStringSet(seq))
##
## MsaAAMultipleAlignment with 64 rows and 99 columns
## aln names
## [1] DAEITQSPRHKITETGRQVTLAC...PLTLESAASSQTSVYFCASSE- TRBV10-1
## [2] DAGITQSPRHKVTETGTPVTLRC...LLTLESATSSQTSVYFCAISE- TRBV10-3
## [3] DAGITQSPRYKITETGRQVTLMC...PLTLESATRSQTSVYFCASSE- TRBV10-2
## [4] NAGVTQTPKFRVLKTGQSMTLLC...LLGLESAAPSQTSVYFCASSY- TRBV6-2
## [5] NAGVTQTPKFRVLKTGQSMTLLC...LLGLESAAPSQTSVYFCASSY- TRBV6-3
## [6] NAGVTQTPKFQVLKTGQSMTLQC...SLRLESAAPSQTSVYFCASSE- TRBV6-1
## [7] NAGVTQTPKFHILKTGQSMTLQC...PLRLVSAAPSQTSVYLCASSY- TRBV6-8
## [8] NAGVTQTPKFHILKTGQSMTLQC...PLRLESAAPSQTSVYFCASSY- TRBV6-9
## [9] NAGVTQTPKFQVLKTGQSMTLQC...PLRLLSAAPSQTSVYFCASSY- TRBV6-5
## ... ...
## [57] DARVTQTPRHKVTEMGQEVTMRC...TLKIQPSEPRDSAVYFCASGL- TRBV12-5
## [58] EAGVTQFPSHSVIEKGQTVTLRC...TLKVQPAELEDSGVYFCASSQ- TRBV14
## [59] EPEVTQTPSHQVTQMGQEVILRC...TLKIRSTKLEDSAMYFCASSE- TRBV2
## [60] SQTIHQWPATLVQPVGSPLSLEC...ILSSKKLLLSDSGFYLCAWS-- TRBV30
## [61] GAVVSQHPSWVICKSGTSVKIEC...TLTVTSAHPEDSSFYICSAR-- TRBV20-1
## [62] SAVVSQHPSRVICKSGTSVNIEC...ALTVTSAHPEDSSFYICSAR-- TRBV20/OR9-2
## [63] SAVISQKPSRDICQRGTSLTIQC...TLTVSNMSPEDSSIYLCSVE-- TRBV29-1
## [64] SAVISQKPSRDICQRGTSMMIQC...TLTVSNRRPEDSSIYLCSVE-- TRBV29/OR9-2
## Con ?AGVTQ?P???V???GQ?VTL?C...?L????????DSA?Y?CASS?- Consensus
names(clrs) = names(trbAln@unmasked)
plot_tree = function(seqs, tip_colors = clrs) {
trbAln = msa(AAStringSet(seq))
tip_colors = tip_colors[names(trbAln@unmasked)]
trbAln2 = msaConvert(trbAln, type="seqinr::alignment")
d = dist.alignment(trbAln2, "identity")
trbTree = nj(d)
fig = c(0.1, 0.9, 0, 1.0)
mar = c(0, 0, 0, 0)
par(fig = fig, mar = mar, xpd = NA)#, bg = "grey")
plot(trbTree, type = "fan", tip.color = tip_colors)
trbTree
}
seq = df.trb$seqaa
names(seq) = df.trb$gene
full_phylo = plot_tree(seq)
## use default substitution matrix
seq = df.trb$cdr1aa
names(seq) = df.trb$gene
cdr1_phylo = plot_tree(seq)
## use default substitution matrix
seq = df.trb$cdr2aa
names(seq) = df.trb$gene
cdr2_phylo = plot_tree(seq)
## use default substitution matrix
seq = paste0(df.trb$cdr1aa, df.trb$cdr2aa)
names(seq) = df.trb$gene
cdr12_phylo = plot_tree(seq)
## use default substitution matrix
fig = c(0.1, 0.9, 0, 1.0)
mar = c(0, 0, 0, 0)
par(fig = fig, mar = mar, xpd = NA, cex=0.3)
Full V alignment vs CDR1
cophyloplot(full_phylo, cdr1_phylo, assoc = cbind(df.trb$gene, df.trb$gene), space=200, col = "red")
Full V alignment vs CDR2
cophyloplot(full_phylo, cdr2_phylo, assoc = cbind(df.trb$gene, df.trb$gene), space=200, col = "red")
Full V alignment vs CDR1+2
cophyloplot(full_phylo, cdr12_phylo, assoc = cbind(df.trb$gene, df.trb$gene), space=200, col = "red")
Compute contacts
pos_max = df.prob$pos_max[1]
compute_contacts = function(tcr_chain, tcr_region, region_seq) {
df.seq = data.frame(tcr_chain = tcr_chain,
tcr_region = tcr_region,
aa_tcr = strsplit(region_seq, "")[[1]])
df.seq$pos_rel_tcr = with(df.seq, (1:length(aa_tcr) - 1) / (length(aa_tcr) - 1) * (pos_max - 1))
df.seq = merge(df.seq, df.prob)
sum(df.seq$P)
}
df.cont = df %>%
group_by(gene) %>%
mutate(P = compute_contacts(chain, "CDR1", cdr1aa) + compute_contacts(chain, "CDR2", cdr2aa))
clrs2 = colorRampPalette(rev(brewer.pal(11, 'RdBu')))(101)
df.cont$Pl = rank(df.cont$P)
df.cont$Pind = with(df.cont, 100 * (Pl - min(Pl)) / (max(Pl) - min(Pl)) + 1)
clrs2 = clrs2[df.cont$Pind]
names(clrs2) = df.cont$gene
seq = df.trb$seqaa
names(seq) = df.trb$gene
full_phylo = plot_tree(seq, clrs2)
## use default substitution matrix
df.tra = subset(df, chain == "TRA")
seq = df.tra$seqaa
names(seq) = df.tra$gene
full_phylo = plot_tree(seq, clrs2)
## use default substitution matrix